Using Libraries:
# Import the pandas package, then use the "read_csv" function to read the labeled training data.
import pandas as pd
# header=0: first row holds the column names; delimiter="\t": tab-separated file;
# quoting=3 (csv.QUOTE_NONE): ignore double quotes inside the review text.
train = pd.read_csv("./data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
# Checking the shape of the read data (rows, columns) — bare expression displays in a notebook.
train.shape
# Check column header value
train.columns.values
# Sample review output: first 500 characters of review #3.
print(train["review"][3][0:500]+"......")
The output is filled with messy HTML tags, e.g. `<br/>`.
%%capture --no-stdout
# Fix Output: strip the HTML markup from the raw review text.
from bs4 import BeautifulSoup
# Pass an explicit parser ("html.parser", from the stdlib) — omitting it raises a
# GuessedAtParserWarning and can produce different results depending on which
# parsers happen to be installed on the machine.
example1 = BeautifulSoup(train["review"][3], "html.parser")
print(example1.get_text()[0:500]+"......")
For our first try, we're removing everything that is not a letter — all numbers and punctuation.
# Removing everything but letters: substitute every character that is not an
# ASCII letter or a space with the empty string.
import re
letters_only = re.sub("[^a-zA-Z ]", "", example1.get_text())
print(letters_only[0:500] + "......")
Then we need to perform 'tokenization' — converting all the words to lowercase and splitting them into individual tokens.
# Tokenization: lowercase the whole text, then split on whitespace
# to get a list of individual word tokens.
words = letters_only.lower().split()
Then we have to remove stop words — common words (like "the" or "and") that carry little meaning on their own.
# Showing stop words
from nltk.corpus import stopwords  # Import the stop word list
print(stopwords.words("english")[0:15])
# Build the stop-word set ONCE before filtering: the original called
# stopwords.words("english") inside the comprehension condition, re-fetching
# the list and doing an O(n) membership scan for every single word.
# A set gives O(1) lookups (same approach as review_to_words below).
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]
print(" ".join(words)[0:500] + "......")
This is the final result we want.
Now, we put all these steps into one single function.
def review_to_words(raw_review):
    """Convert a raw HTML review into a cleaned, space-separated word string.

    Pipeline: strip HTML, drop non-letter characters, lowercase, split into
    tokens, and remove English stop words.

    Args:
        raw_review: one review as a raw string (may contain HTML markup).

    Returns:
        A single string of lowercase, meaningful words separated by spaces.
    """
    # 1. Remove HTML. An explicit parser ("html.parser") avoids the
    #    GuessedAtParserWarning and platform-dependent parsing.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    #
    # 2. Remove non-letters (spaces are kept so words stay separated).
    letters_only = re.sub("[^a-zA-Z ]", "", review_text)
    #
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    #
    # 4. Convert the stop-word list to a set — O(1) membership tests.
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words ("w not in stops" is the idiomatic form).
    meaningful_words = [w for w in words if w not in stops]
    #
    # 6. Join the words back into one string separated by spaces,
    #    and return the result ("return" is a statement, not a function).
    return " ".join(meaningful_words)
%%capture --no-stdout
# Testing Function: sanity-check the full pipeline on the same sample review (#3).
print(review_to_words(train["review"][3])[0:500]+"......")
%%capture --no-stdout
# Fix all reviews: run every training review through review_to_words,
# printing a progress marker every 1000 reviews.
total = train["review"].size
clean_train_reviews = []
for count, raw in enumerate(train["review"], start=1):
    if count % 1000 == 0:
        print("Review %d of %d; " % (count, total), "", end="")
    clean_train_reviews.append(review_to_words(raw))
# Show the cleaned version of the sample review (#3).
print("\n\n" + clean_train_reviews[3][0:500] + "......")
Using CountVectorizer from sklearn, we can count and find the most used tokens (or words).
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words: keep only the 5000 most frequent tokens as features.
# Tokenizer/preprocessor/stop_words stay at None because the reviews
# were already cleaned by review_to_words.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)
# fit_transform learns the vocabulary and converts the cleaned reviews
# into a dense document-term count matrix.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()
# Check result: (number of reviews, vocabulary size).
print(train_data_features.shape)
# Show vocabulary. NOTE: get_feature_names() was deprecated in scikit-learn
# 1.0 and removed in 1.2; get_feature_names_out() is the replacement. It
# returns an ndarray, so wrap in list() to keep the printed output identical.
vocab = list(vectorizer.get_feature_names_out())
print(vocab[0:30])
import numpy as np
# Sum each column to get the total occurrence count of each vocabulary word
# across all training reviews.
dist = np.sum(train_data_features, axis=0)
# Print the first 25 (word, count) pairs on one line.
for tag, count in zip(vocab[0:25], dist[0:25]):
    print("(", tag, ", ", count, ") ", end="")
Using RandomForestClassifier from sklearn, we can build a stronger classifier by training many decision trees on random subsets of the data and aggregating their predictions.
from sklearn.ensemble import RandomForestClassifier

# Train a random forest of 100 decision trees on the bag-of-words features,
# using the labeled sentiment column as the target. fit() returns the
# fitted estimator itself, so the construction and training can be chained.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_data_features, train["sentiment"]
)
%%capture --no-stdout
# Read and process test data (same format and read options as the training set).
test = pd.read_csv("./data/testData.tsv", header=0, delimiter="\t", quoting=3)

# Clean every test review with the same pipeline used for training,
# printing a progress marker every 1000 reviews.
num_reviews = len(test["review"])
clean_test_reviews = []
for count, raw in enumerate(test["review"], start=1):
    if count % 1000 == 0:
        print("Review %d of %d; " % (count, num_reviews), "", end="")
    clean_test_reviews.append(review_to_words(raw))

# Use Bag of Words on the test data: transform (NOT fit_transform) projects
# the test reviews onto the vocabulary learned from the training set.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Use the trained random forest to predict a sentiment for each test review.
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and a "sentiment" column.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

# Use pandas to write the comma-separated output file (quoting=3: no quoting).
output.to_csv("./output/Bag_of_Words_model.csv", index=False, quoting=3)
output.head(10)
With this being a Kaggle dataset, we know that this result achieves an accuracy of 0.84136.
